import pandas as pd
import numpy as np
import datetime
from constants import LAST_DATE_CONST, SAMPLING_RATE, STORAGE_DIR, FREQUENCY, LIMIT_INTERP

def _find_gap_bounds(is_gap):
    """Locate interior runs of ``True`` in a boolean sequence.

    Args:
        is_gap: 1-D boolean array-like; ``True`` marks a missing sample.

    Returns:
        A list of ``(before, after)`` integer positions, one per run, where
        ``before`` is the last ``False`` position preceding the run and
        ``after`` is the first ``False`` position following it. Runs that
        touch either end of the sequence are omitted because they have no
        sample on one side.
    """
    mask = np.asarray(is_gap, dtype=bool)
    if not mask.any():
        return []
    # Pad with False so every run has both a rising and a falling edge.
    padded = np.concatenate(([False], mask, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    starts, stops = edges[::2], edges[1::2]  # each run occupies [start, stop)
    return [
        (int(start) - 1, int(stop))
        for start, stop in zip(starts, stops)
        if start > 0 and stop < mask.size
    ]


def _bridge_stationary_gaps(values, max_mileage_diff=1):
    """Linearly fill ``-1`` gaps whose bounding mileage barely changed.

    Args:
        values: frame with ``SOC`` and ``Mileage`` columns in which missing
            samples are marked with the sentinel ``-1``.
        max_mileage_diff: a gap is filled only when the absolute mileage
            difference between the samples on either side of it is strictly
            below this value.

    Returns:
        A frame of the same shape. Rows strictly inside qualifying gaps hold
        values interpolated linearly by row position; all other rows
        (including gaps that did not qualify, which keep ``-1``) are
        unchanged.
    """
    mileage = values['Mileage'].to_numpy()
    bridge = np.zeros(len(values), dtype=bool)
    for before, after in _find_gap_bounds(mileage == -1):
        if abs(mileage[before] - mileage[after]) < max_mileage_diff:
            # Only the rows strictly between the two bounding samples are
            # blanked; the bounding samples themselves stay untouched.
            bridge[before + 1:after] = True

    if not bridge.any():
        return values

    values = values.copy()
    values.loc[bridge, ['SOC', 'Mileage']] = np.nan
    return values.interpolate(method='linear')


def preprocess(df: pd.DataFrame) -> int:
    """Resample one time series onto a regular grid and save it as CSV.

    The input must provide the columns ``Timestamp``, ``VIN``, ``Home``,
    ``SOC`` and ``Mileage``. The input frame itself is not modified.

    Steps:
      1. Index by ``Timestamp`` and drop duplicated timestamps (first wins).
      2. Forward/backward fill ``VIN`` and ``Home`` on the ``SAMPLING_RATE``
         grid.
      3. Interpolate ``SOC`` and ``Mileage`` on the grid, filling at most
         ``LIMIT_INTERP`` consecutive missing samples; what remains missing
         is marked ``-1``.
      4. Gaps still marked ``-1`` whose surrounding mileage differs by less
         than 1 are filled linearly; other gaps keep ``-1``.
      5. Drop rows before the first Monday and rows at or after
         ``LAST_DATE_CONST``.
      6. Write ``{STORAGE_DIR}/{SAMPLING_RATE}/{VIN}.csv`` (``/`` in the VIN
         replaced by ``_``) with columns ``datetime``, ``VIN``, ``Home``,
         ``SOC``, ``Mileage``.

    Args:
        df: raw records for one series.

    Returns:
        ``1`` if a CSV file was written, ``0`` if no rows remained.
    """
    # Work on a copy so the caller's frame keeps its index and columns.
    frame = df.copy()
    frame.index = pd.to_datetime(frame['Timestamp'])
    frame = frame.drop(columns=['Timestamp'])

    # remove duplicates (timestamps are rounded to nearest minute)
    frame = frame[~frame.index.duplicated(keep='first')]

    if frame.empty:
        print("Empty dataframe, skipping..")
        return 0

    # resample and interpolate time series data
    oidx = frame.index
    nidx = pd.date_range(oidx.min(), oidx.max(), freq=SAMPLING_RATE)

    # fill forward/backward categorical features
    # NOTE(review): resample() bins and date_range(oidx.min(), ...) only
    # produce the same grid when the first timestamp lies on a
    # SAMPLING_RATE boundary -- confirm this holds for the input data.
    df_fill = frame.loc[:, ['VIN', 'Home']].resample(SAMPLING_RATE).ffill().bfill()
    df_fill.index = df_fill.index.floor('min')

    # Interpolate on the union of original and grid timestamps so original
    # samples anchor the result, then keep only the grid points. Samples
    # beyond the LIMIT_INTERP budget stay missing and become -1.
    df_interp = (
        frame.loc[:, ['SOC', 'Mileage']]
        .reindex(oidx.union(nidx))
        .interpolate('index', limit=LIMIT_INTERP)
        .reindex(nidx)
        .fillna(-1)
    )
    df_interp.index = df_interp.index.floor('min')

    # Long gaps during which the mileage barely moved are filled linearly;
    # all other long gaps keep the -1 marker.
    df_interp = _bridge_stationary_gaps(df_interp)

    merged = pd.concat([df_fill, df_interp], axis=1)

    # save datetime
    merged.insert(0, 'datetime', merged.index)

    # start time series on the first Monday (day_of_week == 0); if there is
    # no Monday at all, nothing is kept
    is_monday = np.asarray(merged['datetime'].dt.day_of_week == 0)
    first_monday = int(is_monday.argmax()) if is_monday.any() else len(merged)
    merged = merged.iloc[first_monday:, :]

    # end time series before LAST_DATE_CONST
    merged = merged[merged['datetime'] < LAST_DATE_CONST]
    merged = merged.reset_index(drop=True)

    if len(merged) == 0:
        print("Empty dataframe, skipping..")
        return 0

    vin = merged['VIN'].iloc[0].replace("/", "_")
    merged.to_csv(f'{STORAGE_DIR}/{SAMPLING_RATE}/{vin}.csv', index=False)
    return 1
